{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Constructing independent variables: Mentioned authority? Mentioned researcher? Mentioned politician? Mentioned physician?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'spacy'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[18], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspacy\u001b[39;00m\n\u001b[1;32m      2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mspacy\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m displacy\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcollections\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Counter\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'spacy'"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import re\n",
    "import string\n",
    "import time\n",
    "import warnings\n",
    "from collections import Counter\n",
    "\n",
    "import en_core_web_sm\n",
    "import pandas as pd\n",
    "import requests\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "from SPARQLWrapper import SPARQLWrapper, JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the spaCy English pipeline once; extractNE() calls this shared `nlp` object on every tweet.\n",
    "nlp = en_core_web_sm.load()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Getting named entities from tweet text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extractNE(text, excluded_NERs):\n",
    "    \"\"\"Return the named entities found in `text`, plus the length of `text`.\n",
    "\n",
    "    The text is run through the module-level `nlp` pipeline. Entities whose\n",
    "    label appears in `excluded_NERs` are skipped.\n",
    "\n",
    "    Returns a tuple `(entities, n_chars)`: `entities` is a list of\n",
    "    `(entity_text, entity_label)` pairs in document order and `n_chars` is\n",
    "    `len(text)`.\n",
    "    \"\"\"\n",
    "    parsed = nlp(text)\n",
    "    entities = [\n",
    "        (span.text, span.label_)\n",
    "        for span in parsed.ents\n",
    "        if span.label_ not in excluded_NERs\n",
    "    ]\n",
    "    return entities, len(text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Entity labels that getNEtwts() passes to extractNE() so that entities with these labels are dropped.\n",
    "excluded_NERs = ['DATE','TIME','PERCENT','MONEY','QUANTITY','ORDINAL','CARDINAL']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getNEtwts(twts):\n",
    "    \"\"\"Run named-entity extraction over a list of tweet dicts.\n",
    "\n",
    "    The text of each tweet is read from its 'text' key when present and from\n",
    "    'full_text' otherwise. Entities are filtered with the module-level\n",
    "    `excluded_NERs` list.\n",
    "\n",
    "    Returns one dict per tweet, in input order, with the keys 'id', 'NEs'\n",
    "    (the entity list from extractNE) and 'len' (character length of the text).\n",
    "    \"\"\"\n",
    "    summaries = []\n",
    "    for tweet in twts:\n",
    "        field = 'text' if 'text' in tweet.keys() else 'full_text'\n",
    "        entities, n_chars = extractNE(tweet[field], excluded_NERs)\n",
    "        summaries.append({'id': tweet['id'], 'NEs': entities, 'len': n_chars})\n",
    "    return summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ExcludeTwtKeywords(twts, target_keywords): \n",
    "    result_twts = list()\n",
    "    for twt in twts: \n",
    "        if sum([(k in target_keywords) for k in twt['keywords']]) < len(twt['keywords']):\n",
    "            result_twts.append(twt)\n",
    "    return result_twts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the anti-vaccine tweets. The encoding is explicit so decoding does not depend on\n",
    "# the platform default, matching the other JSON reads in this notebook.\n",
    "with open('anti_vax.json', encoding='utf-8') as f:\n",
    "    anti_twts = json.load(f)\n",
    "# Drop tweets whose keywords all fall in the excluded list.\n",
    "anti_twts_cleaned = ExcludeTwtKeywords(anti_twts, ['plot','plots','viz','vis'])\n",
    "\n",
    "result_twts_anti = getNEtwts(anti_twts_cleaned)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the anti-tweet entity records; a later cell reloads them from results/NEs_anti.json.\n",
    "with open('results/NEs_anti.json','w') as f:\n",
    "    json.dump(result_twts_anti,f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the pro-vaccine tweets. The encoding is explicit so decoding does not depend on\n",
    "# the platform default, matching the other JSON reads in this notebook.\n",
    "with open('pro_vax.json', encoding='utf-8') as f:\n",
    "    pro_twts = json.load(f)\n",
    "# Drop tweets whose keywords all fall in the excluded list.\n",
    "pro_twts_cleaned = ExcludeTwtKeywords(pro_twts, ['plot','plots','viz','vis'])\n",
    "\n",
    "result_twts_pro = getNEtwts(pro_twts_cleaned)\n",
    "\n",
    "# Persist the entity records; a later cell reloads them from results/NEs_pro.json.\n",
    "with open('results/NEs_pro.json','w') as f:\n",
    "    json.dump(result_twts_pro,f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculateNEpercent(result_twts):\n",
    "    NE_percents = list()\n",
    "    for t in result_twts:\n",
    "        NE_percent = dict()\n",
    "        percent = len(t['NEs'])/t['len']\n",
    "        NE_percent['id'] = t['id']\n",
    "        NE_percent['NE_percent'] = percent\n",
    "        NE_percents.append(NE_percent)\n",
    "    return NE_percents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_NEcounts_anti = pd.DataFrame(calculateNEpercent(result_twts_anti))\n",
    "df_NEcounts_anti['type'] = 'anti'\n",
    "df_NEcounts_pro = pd.DataFrame(calculateNEpercent(result_twts_pro))\n",
    "df_NEcounts_pro['type'] = 'pro'\n",
    "\n",
    "# DataFrame.append was removed in pandas 2.0. pd.concat stacks the two frames the same\n",
    "# way: anti rows first, each frame keeping its own row index.\n",
    "pd.concat([df_NEcounts_anti, df_NEcounts_pro]).to_csv('results/NE_percents.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mapping named entities to authority figures from Wikidata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "agent_={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '\n",
    "'AppleWebKit/537.11 (KHTML, like Gecko) '\n",
    "'Chrome/23.0.1271.64 Safari/537.11',\n",
    "'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8',\n",
    "'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',\n",
    "'Accept-Encoding': 'none',\n",
    "'Accept-Language': 'en-US,en;q=0.8',\n",
    "'Connection': 'keep-alive'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shared client for the Wikidata SPARQL endpoint; findTargetOcc() sets and runs its query on this object.\n",
    "sparql = SPARQLWrapper(\"https://query.wikidata.org/sparql\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_occus_dict = {\n",
    "\"Q39631\":\"physician\",\n",
    "\"Q1650915\":\"researcher\",\n",
    "\"Q82955\":\"politician\"}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def findTargetOcc(ent_name, target_occus_dict):\n",
    "    \"\"\"Look up `ent_name` on Wikidata and return the tracked occupations it has.\n",
    "\n",
    "    The query selects items that carry `ent_name` as an English-language string\n",
    "    value of any property, that are the subject of an English Wikipedia\n",
    "    article, and reads the labels of their occupation (P106) statements.\n",
    "    Labels that also occur among the values of `target_occus_dict` are returned.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    ent_name : str\n",
    "        Entity name to search for (\"Donald Trump\" is rewritten to\n",
    "        \"Donald J. Trump\" before the lookup).\n",
    "    target_occus_dict : dict\n",
    "        Only its values (occupation labels) are used.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Matching occupation labels, possibly with repeats when several rows\n",
    "        match. Empty when nothing matches or when the query fails; a failure is\n",
    "        reported with a RuntimeWarning rather than raised, so one bad lookup\n",
    "        does not abort a long batch.\n",
    "    \"\"\"\n",
    "    if ent_name == \"Donald Trump\":\n",
    "        ent_name = \"Donald J. Trump\"\n",
    "\n",
    "    # json.dumps yields a double-quoted literal with quotes, backslashes and control\n",
    "    # characters escaped; the same escapes are valid in a SPARQL string literal, so a\n",
    "    # name containing a quote can no longer break (or alter) the query.\n",
    "    name_literal = json.dumps(ent_name, ensure_ascii=False)\n",
    "\n",
    "    sparql.setQuery(\"\"\"\n",
    "            SELECT distinct ?item ?occupation ?occupationLabel WHERE{  \n",
    "              ?item ?label %s@en.  \n",
    "              ?article schema:about ?item .\n",
    "              ?article schema:inLanguage \"en\" .\n",
    "              ?article schema:isPartOf <https://en.wikipedia.org/>. \n",
    "              ?item wdt:P106 ?occupation .\n",
    "              SERVICE wikibase:label { bd:serviceParam wikibase:language \"en\". }    \n",
    "        }\n",
    "    \"\"\" % name_literal)\n",
    "    sparql.setReturnFormat(JSON)\n",
    "\n",
    "    tracked_labels = set(target_occus_dict.values())\n",
    "    try:\n",
    "        results = sparql.query().convert()\n",
    "        # Walk the result bindings directly instead of going through\n",
    "        # pd.io.json.json_normalize, which is deprecated and missing from newer\n",
    "        # pandas releases; with the blanket except below, that failure would\n",
    "        # silently turn every lookup into an empty result.\n",
    "        labels = [row['occupationLabel']['value'] for row in results['results']['bindings']]\n",
    "        return [label for label in labels if label in tracked_labels]\n",
    "    except Exception as e:\n",
    "        # Best effort: keep the batch running, but make the failure visible.\n",
    "        warnings.warn(f\"Wikidata lookup failed for {ent_name!r}: {e!r}\", RuntimeWarning, stacklevel=2)\n",
    "        return []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def AddTargetOccusToList(data, target_occus_dict):\n",
    "    \"\"\"Collect, per tweet, the tracked occupations of the entities it mentions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : iterable of dict\n",
    "        Records with the keys 'id' and 'NEs'. 'NEs' is expected to be a list of\n",
    "        entity-name strings, as produced by makeCleanEntDict().\n",
    "    target_occus_dict : dict\n",
    "        Passed through to findTargetOcc().\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One `{'id': ..., 'occupations': [...]}` record for every tweet with at\n",
    "        least one matching occupation; other tweets are omitted. Occupations\n",
    "        are de-duplicated and sorted so the output is reproducible.\n",
    "    \"\"\"\n",
    "    # Each findTargetOcc() call is a network query, so every distinct name is looked up\n",
    "    # only once per call of this function (a failed lookup is therefore not retried).\n",
    "    lookup_cache = {}\n",
    "    occus_list = []\n",
    "    for item in data:\n",
    "        found = set()\n",
    "        for NE in item['NEs']:\n",
    "            if NE not in lookup_cache:\n",
    "                lookup_cache[NE] = findTargetOcc(NE, target_occus_dict)\n",
    "            found.update(lookup_cache[NE])\n",
    "        if found:\n",
    "            occus_list.append({'id': item['id'], 'occupations': sorted(found)})\n",
    "    return occus_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Twitter API bearer token comes from the environment so it is never stored in the notebook.\n",
    "bearer_token = os.environ.get(\"BEARER_TOKEN\")\n",
    "if bearer_token is None or bearer_token == \"\":\n",
    "    raise RuntimeError(\"Missing BEARER_TOKEN environment variable.\")\n",
    "\n",
    "# Twitter API v2 search endpoint URL.\n",
    "SEARCH_URL = \"https://api.twitter.com/2/tweets/search/all\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bearer_oauth(r):\n",
    "    \"\"\"Auth hook: stamp the bearer token and a User-Agent on request `r`.\n",
    "\n",
    "    Reads the module-level `bearer_token`, sets the two headers on\n",
    "    `r.headers` in place and returns the same request object.\n",
    "    \"\"\"\n",
    "    stamped_headers = (\n",
    "        (\"Authorization\", f\"Bearer {bearer_token}\"),\n",
    "        (\"User-Agent\", \"v2FullArchiveSearchPython\"),\n",
    "    )\n",
    "    for header, value in stamped_headers:\n",
    "        r.headers[header] = value\n",
    "    return r\n",
    "\n",
    "\n",
    "def connect_to_endpoint(url, params=None):\n",
    "    \"\"\"Send an authenticated GET request and return the decoded JSON reply.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    url : str\n",
    "        Endpoint to call.\n",
    "    params : dict, optional\n",
    "        Query-string parameters; omit when the URL needs none.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    Exception\n",
    "        With args `(status_code, response_text)` for any reply whose status is\n",
    "        not 200.\n",
    "    \"\"\"\n",
    "    # Request the URL the caller passed in rather than a module-level name.\n",
    "    response = requests.request(\"GET\", url, auth=bearer_oauth, params=params)\n",
    "    if response.status_code != 200:\n",
    "        raise Exception(response.status_code, response.text)\n",
    "    return response.json()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "def truncExpBackOff(n): \n",
    "    if n>6: \n",
    "        n = 6\n",
    "    sleep_time = pow(2,n)+random.random()\n",
    "    if sleep_time >64: \n",
    "        sleep_time = 64\n",
    "    time.sleep(sleep_time)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#In cases where the detected entity was a Twitter account handle, \n",
    "#we queried Twitter to get the name associated with the account.\n",
    "\n",
    "def twtHandle2Name(twt_handle):\n",
    "    \"\"\"Resolve a Twitter handle (without the leading '@') to the account's name.\n",
    "\n",
    "    Returns the 'name' field of the first user in the API reply. The\n",
    "    placeholder string 'non-existing handle' is returned when the reply has no\n",
    "    'data' entry or when the request fails for any reason other than rate\n",
    "    limiting. HTTP 429 replies are retried, without an upper bound, after a\n",
    "    truncExpBackOff() pause.\n",
    "    \"\"\"\n",
    "    n = 0\n",
    "    while True:\n",
    "        try:\n",
    "            # The handle is passed as a query parameter so it is URL-encoded.\n",
    "            search_result = connect_to_endpoint(\n",
    "                'https://api.twitter.com/2/users/by', {'usernames': twt_handle}\n",
    "            )\n",
    "            if 'data' in search_result.keys():\n",
    "                return search_result['data'][0]['name']\n",
    "            return 'non-existing handle'\n",
    "        except Exception as e:\n",
    "            # connect_to_endpoint raises Exception(status_code, text), but other\n",
    "            # errors carry different args, so inspect only the first one instead\n",
    "            # of unpacking a fixed pair.\n",
    "            code = e.args[0] if e.args else None\n",
    "            if code == 429 or str(code) == '429':\n",
    "                n += 1\n",
    "                truncExpBackOff(n)\n",
    "                continue\n",
    "            return 'non-existing handle'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parseTwtStrings(s): \n",
    "    l_temp = list()\n",
    "    for m in re.findall(r'(?=((\\@|\\#)(.*?)( |\\@|\\#|$)))', s): \n",
    "        l_temp.append(m[0])\n",
    "\n",
    "    for item in l_temp:\n",
    "        if item[-1] == ('@' or '#'):\n",
    "            item = item[0:-1]\n",
    "        s = s.replace(item,'')\n",
    "\n",
    "    if len(s) > 0: \n",
    "        l_temp.append(s.strip())\n",
    "\n",
    "    l_clean = list()\n",
    "    for token in l_temp:\n",
    "        #print(token)\n",
    "        if len(token) > 0 and token[-1] == ('@' or '#' or ' '):\n",
    "            token = token[0:-1]\n",
    "        l_clean.append(token)\n",
    "    \n",
    "    return l_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleanEnt(s):\n",
    "    \"\"\"Turn one raw entity string into a list of capitalised lookup names.\n",
    "\n",
    "    The string is split with parseTwtStrings(). For each non-empty piece:\n",
    "    '@handle' is resolved to an account name via twtHandle2Name(), '#tag'\n",
    "    loses its '#', a piece containing 'https://' is dropped, and anything else\n",
    "    is kept as is. Every kept name is normalised with string.capwords().\n",
    "    \"\"\"\n",
    "    names = []\n",
    "    for piece in parseTwtStrings(s):\n",
    "        if not piece:\n",
    "            continue\n",
    "        if piece.startswith('@'):\n",
    "            raw_name = twtHandle2Name(piece[1:])\n",
    "        elif piece.startswith('#'):\n",
    "            raw_name = piece[1:]\n",
    "        elif 'https://' in piece.lower():\n",
    "            continue\n",
    "        else:\n",
    "            raw_name = piece\n",
    "        names.append(string.capwords(raw_name))\n",
    "    return names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "def makeCleanEntDict(twt_ents_list):\n",
    "    \"\"\"Replace each tweet's raw entity list with cleaned entity names.\n",
    "\n",
    "    For every record in `twt_ents_list`, the first element of each entry in\n",
    "    'NEs' is passed through cleanEnt() and the resulting names are\n",
    "    concatenated in order.\n",
    "\n",
    "    Returns one `{'id': ..., 'NEs': [names]}` dict per input record; a tweet\n",
    "    without entities gets an empty list.\n",
    "    \"\"\"\n",
    "    cleaned_twts = []\n",
    "    for tweet in twt_ents_list:\n",
    "        names = []\n",
    "        for raw_ent in tweet['NEs']:\n",
    "            names.extend(cleanEnt(raw_ent[0]))\n",
    "        cleaned_twts.append({'id': tweet['id'], 'NEs': names})\n",
    "    return cleaned_twts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the pro-tweet entity records saved earlier in results/NEs_pro.json.\n",
    "with open('results/NEs_pro.json') as f:\n",
    "    pro = json.load(f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the anti-tweet entity records saved earlier in results/NEs_anti.json.\n",
    "with open('results/NEs_anti.json') as f:\n",
    "    anti = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the entity strings of the pro tweets; cleanEnt() resolves @handles through the Twitter API.\n",
    "pro_clean = makeCleanEntDict(pro)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/SOC.NORTHWESTERN.EDU/rcj7738/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead\n"
     ]
    }
   ],
   "source": [
    "# Query Wikidata for each cleaned entity name and keep the tweets with a tracked occupation.\n",
    "occus_list_pro = AddTargetOccusToList(pro_clean, target_occus_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the per-tweet occupation lists for the pro tweets.\n",
    "with open('results/occ_NEs_pro.json', 'w') as f:\n",
    "    json.dump(occus_list_pro, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/SOC.NORTHWESTERN.EDU/rcj7738/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:22: FutureWarning: pandas.io.json.json_normalize is deprecated, use pandas.json_normalize instead\n"
     ]
    }
   ],
   "source": [
    "# Same two steps for the anti tweets: clean the entity strings, then look up occupations on Wikidata.\n",
    "anti_clean = makeCleanEntDict(anti)\n",
    "occus_list_anti = AddTargetOccusToList(anti_clean, target_occus_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the per-tweet occupation lists for the anti tweets.\n",
    "with open('results/occ_NEs_anti.json', 'w') as f:\n",
    "    json.dump(occus_list_anti, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the saved per-tweet occupation lists for both groups.\n",
    "with open(\"results/occ_NEs_anti.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    anti_occ_items = json.load(f)\n",
    "\n",
    "with open(\"results/occ_NEs_pro.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    pro_occ_items = json.load(f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the occupation records by tweet id. Ids are converted to strings so they can be\n",
    "# compared with the stringified ids used when the CSV is written; a record without an\n",
    "# 'occupations' key maps to an empty set.\n",
    "anti_occ_by_id = {\n",
    "    str(item[\"id\"]): set(item.get(\"occupations\", []))\n",
    "    for item in anti_occ_items\n",
    "}\n",
    "\n",
    "pro_occ_by_id = {\n",
    "    str(item[\"id\"]): set(item.get(\"occupations\", []))\n",
    "    for item in pro_occ_items\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One CSV row per tweet: a 0/1 flag for each tracked occupation plus the tweet's group label.\n",
    "with open(\"results/authority_figures.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as out:\n",
    "    writer = csv.writer(out)\n",
    "    writer.writerow([\"id\", \"researcher\", \"physician\", \"politician\", \"type\"])\n",
    "\n",
    "    # Anti tweets are written first, then pro tweets. A tweet with no stored\n",
    "    # occupation record gets all-zero flags.\n",
    "    for twts, occ_by_id, group in (\n",
    "        (anti_twts_cleaned, anti_occ_by_id, \"anti\"),\n",
    "        (pro_twts_cleaned, pro_occ_by_id, \"pro\"),\n",
    "    ):\n",
    "        for t in twts:\n",
    "            tid = str(t.get(\"id\", \"\"))\n",
    "            occs = occ_by_id.get(tid, set())\n",
    "            flags = [int(occ in occs) for occ in (\"researcher\", \"physician\", \"politician\")]\n",
    "            writer.writerow([tid, *flags, group])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
